{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Folder the tokenizer files are written to; created here if it is missing,\n",
    "# and left untouched (no error) if it already exists.\n",
    "out = 'electra-small-generator-bahasa-cased'\n",
    "os.makedirs(name=out, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import ElectraTokenizer, ElectraModel, ElectraConfig, AutoTokenizer, AutoModelWithLMHead, pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('electra-small-generator-bahasa-cased/vocab.txt',\n",
       " 'electra-small-generator-bahasa-cased/special_tokens_map.json',\n",
       " 'electra-small-generator-bahasa-cased/added_tokens.json')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build a cased tokenizer (do_lower_case=False) from the local vocab file and\n",
    "# save its files into `out`, the directory created in the first cell, so the\n",
    "# directory name is defined in exactly one place.\n",
    "tokenizer = ElectraTokenizer('bahasa.wordpiece', do_lower_case=False)\n",
    "tokenizer.save_pretrained(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "import torch\n",
    "\n",
    "from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra\n",
    "\n",
    "\n",
    "# Show INFO-level records so the per-variable 'Loading TF weight ...' messages\n",
    "# logged by transformers during conversion are visible in the cell output.\n",
    "logging.basicConfig(level=logging.INFO)\n",
    "\n",
    "\n",
    "def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator):\n",
    "    \"\"\"Convert a TensorFlow ELECTRA checkpoint into a saved PyTorch state dict.\n",
    "\n",
    "    Args:\n",
    "        tf_checkpoint_path: Path of the TensorFlow checkpoint to read.\n",
    "        config_file: JSON file the ElectraConfig is built from.\n",
    "        pytorch_dump_path: File the resulting state dict is written to.\n",
    "        discriminator_or_generator: 'discriminator' builds an\n",
    "            ElectraForPreTraining model, 'generator' builds an\n",
    "            ElectraForMaskedLM model.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If discriminator_or_generator is neither of those names.\n",
    "    \"\"\"\n",
    "    config = ElectraConfig.from_json_file(config_file)\n",
    "    config_text = str(config)\n",
    "    print(\"Building PyTorch model from configuration: {}\".format(config_text))\n",
    "\n",
    "    # Checked in order: the first name equal to the argument decides the class.\n",
    "    head_classes = (\n",
    "        (\"discriminator\", ElectraForPreTraining),\n",
    "        (\"generator\", ElectraForMaskedLM),\n",
    "    )\n",
    "    model_cls = next((cls for kind, cls in head_classes if discriminator_or_generator == kind), None)\n",
    "    if model_cls is None:\n",
    "        raise ValueError(\"The discriminator_or_generator argument should be either 'discriminator' or 'generator'\")\n",
    "    model = model_cls(config)\n",
    "\n",
    "    # Load the TensorFlow checkpoint weights into the freshly built model.\n",
    "    load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator)\n",
    "\n",
    "    # Only the state dict is written, not the module object itself.\n",
    "    print(\"Save PyTorch model to {}\".format(pytorch_dump_path))\n",
    "    weights = model.state_dict()\n",
    "    torch.save(weights, pytorch_dump_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Building PyTorch model from configuration: ElectraConfig {\n",
      "  \"_num_labels\": 2,\n",
      "  \"architectures\": null,\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bad_words_ids\": null,\n",
      "  \"bos_token_id\": null,\n",
      "  \"decoder_start_token_id\": null,\n",
      "  \"do_sample\": false,\n",
      "  \"early_stopping\": false,\n",
      "  \"embedding_size\": 128,\n",
      "  \"eos_token_id\": null,\n",
      "  \"finetuning_task\": null,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 64,\n",
      "  \"id2label\": {\n",
      "    \"0\": \"LABEL_0\",\n",
      "    \"1\": \"LABEL_1\"\n",
      "  },\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 256,\n",
      "  \"is_decoder\": false,\n",
      "  \"is_encoder_decoder\": false,\n",
      "  \"label2id\": {\n",
      "    \"LABEL_0\": 0,\n",
      "    \"LABEL_1\": 1\n",
      "  },\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"length_penalty\": 1.0,\n",
      "  \"max_length\": 20,\n",
      "  \"max_position_embeddings\": 512,\n",
      "  \"min_length\": 0,\n",
      "  \"model_type\": \"electra\",\n",
      "  \"no_repeat_ngram_size\": 0,\n",
      "  \"num_attention_heads\": 1,\n",
      "  \"num_beams\": 1,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"num_return_sequences\": 1,\n",
      "  \"output_attentions\": false,\n",
      "  \"output_hidden_states\": false,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"prefix\": null,\n",
      "  \"pruned_heads\": {},\n",
      "  \"repetition_penalty\": 1.0,\n",
      "  \"task_specific_params\": null,\n",
      "  \"temperature\": 1.0,\n",
      "  \"top_k\": 50,\n",
      "  \"top_p\": 1.0,\n",
      "  \"torchscript\": false,\n",
      "  \"type_vocab_size\": 2,\n",
      "  \"use_bfloat16\": false,\n",
      "  \"vocab_size\": 32000\n",
      "}\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.modeling_electra:Converting TensorFlow checkpoint from /home/husein/electra/electra/dataset/models/bahasa-small/model.ckpt-200000\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/bias with shape [1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/bias/adam_m with shape [1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/bias/adam_v with shape [1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/kernel with shape [256, 1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/kernel/adam_m with shape [256, 1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight discriminator_predictions/dense_1/kernel/adam_v with shape [256, 1]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/beta with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/beta/adam_m with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/beta/adam_v with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/gamma with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/gamma/adam_m with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/LayerNorm/gamma/adam_v with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/position_embeddings with shape [512, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/position_embeddings/adam_m with shape [512, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/position_embeddings/adam_v with shape [512, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/token_type_embeddings with shape [2, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/token_type_embeddings/adam_m with shape [2, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/token_type_embeddings/adam_v with shape [2, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/word_embeddings with shape [32000, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/word_embeddings/adam_m with shape [32000, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings/word_embeddings/adam_v with shape [32000, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings_project/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings_project/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings_project/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings_project/kernel with shape [128, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings_project/kernel/adam_m with shape [128, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/embeddings_project/kernel/adam_v with shape [128, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_0/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_1/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_10/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_11/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_2/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_3/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_4/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_5/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_6/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_7/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_8/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/output/dense/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/key/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/query/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/kernel with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/kernel/adam_m with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/attention/self/value/kernel/adam_v with shape [256, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/bias with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/bias/adam_m with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/bias/adam_v with shape [1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/kernel with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/kernel/adam_m with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/intermediate/dense/kernel/adam_v with shape [256, 1024]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/beta with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/beta/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/beta/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/gamma with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/gamma/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/LayerNorm/gamma/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/kernel with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/kernel/adam_m with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight electra/encoder/layer_9/output/dense/kernel/adam_v with shape [1024, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/kernel with shape [128, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/kernel/adam_m with shape [128, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/embeddings_project/kernel/adam_v with shape [128, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_0/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_1/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_10/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_11/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_2/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_3/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_4/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_5/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_6/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_7/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_8/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/output/dense/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/key/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/query/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/kernel with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/kernel/adam_m with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/attention/self/value/kernel/adam_v with shape [64, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/bias with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/bias/adam_m with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/bias/adam_v with shape [256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/kernel with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/kernel/adam_m with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/intermediate/dense/kernel/adam_v with shape [64, 256]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/beta with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/beta/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/beta/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/gamma with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/gamma/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/LayerNorm/gamma/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/bias with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/bias/adam_m with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/bias/adam_v with shape [64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/kernel with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/kernel/adam_m with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator/encoder/layer_9/output/dense/kernel/adam_v with shape [256, 64]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/beta with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/beta/adam_m with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/beta/adam_v with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/gamma with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/gamma/adam_m with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/LayerNorm/gamma/adam_v with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/bias with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/bias/adam_m with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/bias/adam_v with shape [128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/kernel with shape [64, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/kernel/adam_m with shape [64, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/dense/kernel/adam_v with shape [64, 128]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/output_bias with shape [32000]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/output_bias/adam_m with shape [32000]\n",
      "INFO:transformers.modeling_electra:Loading TF weight generator_predictions/output_bias/adam_v with shape [32000]\n",
      "INFO:transformers.modeling_electra:Loading TF weight global_step with shape []\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skipping discriminator_predictions/dense/bias ['discriminator_predictions', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Skipping discriminator_predictions/dense/bias/adam_m ['discriminator_predictions', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Skipping discriminator_predictions/dense/bias/adam_v ['discriminator_predictions', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Skipping discriminator_predictions/dense/kernel ['discriminator_predictions', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Skipping discriminator_predictions/dense/kernel/adam_m ['discriminator_predictions', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Skipping discriminator_predictions/dense/kernel/adam_v ['discriminator_predictions', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Skipping discriminator_predictions/dense_1/bias ['discriminator_predictions', 'dense_prediction', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Skipping discriminator_predictions/dense_1/bias/adam_m ['discriminator_predictions', 'dense_prediction', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Skipping discriminator_predictions/dense_1/bias/adam_v ['discriminator_predictions', 'dense_prediction', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Skipping discriminator_predictions/dense_1/kernel ['discriminator_predictions', 'dense_prediction', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Skipping discriminator_predictions/dense_1/kernel/adam_m ['discriminator_predictions', 'dense_prediction', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Skipping discriminator_predictions/dense_1/kernel/adam_v ['discriminator_predictions', 'dense_prediction', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator_predictions'\n",
      "Initialize PyTorch weight ['electra', 'embeddings', 'LayerNorm', 'beta'] electra/embeddings/LayerNorm/beta\n",
      "Skipping electra/embeddings/LayerNorm/beta/adam_m ['electra', 'embeddings', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/embeddings/LayerNorm/beta/adam_v ['electra', 'embeddings', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'embeddings', 'LayerNorm', 'gamma'] electra/embeddings/LayerNorm/gamma\n",
      "Skipping electra/embeddings/LayerNorm/gamma/adam_m ['electra', 'embeddings', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping electra/embeddings/LayerNorm/gamma/adam_v ['electra', 'embeddings', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'embeddings', 'position_embeddings'] electra/embeddings/position_embeddings\n",
      "Skipping electra/embeddings/position_embeddings/adam_m ['electra', 'embeddings', 'position_embeddings', 'adam_m'] 'Embedding' object has no attribute 'adam_m'\n",
      "Skipping electra/embeddings/position_embeddings/adam_v ['electra', 'embeddings', 'position_embeddings', 'adam_v'] 'Embedding' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'embeddings', 'token_type_embeddings'] electra/embeddings/token_type_embeddings\n",
      "Skipping electra/embeddings/token_type_embeddings/adam_m ['electra', 'embeddings', 'token_type_embeddings', 'adam_m'] 'Embedding' object has no attribute 'adam_m'\n",
      "Skipping electra/embeddings/token_type_embeddings/adam_v ['electra', 'embeddings', 'token_type_embeddings', 'adam_v'] 'Embedding' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'embeddings', 'word_embeddings'] electra/embeddings/word_embeddings\n",
      "Skipping electra/embeddings/word_embeddings/adam_m ['electra', 'embeddings', 'word_embeddings', 'adam_m'] 'Embedding' object has no attribute 'adam_m'\n",
      "Skipping electra/embeddings/word_embeddings/adam_v ['electra', 'embeddings', 'word_embeddings', 'adam_v'] 'Embedding' object has no attribute 'adam_v'\n",
      "Skipping electra/embeddings_project/bias ['discriminator', 'embeddings_project', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/embeddings_project/bias/adam_m ['discriminator', 'embeddings_project', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/embeddings_project/bias/adam_v ['discriminator', 'embeddings_project', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/embeddings_project/kernel ['discriminator', 'embeddings_project', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/embeddings_project/kernel/adam_m ['discriminator', 'embeddings_project', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/embeddings_project/kernel/adam_v ['discriminator', 'embeddings_project', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/dense/bias ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/key/bias ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/key/kernel ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/query/bias ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/query/kernel ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/value/bias ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/value/kernel ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/intermediate/dense/bias ['discriminator', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/dense/bias ['discriminator', 'encoder', 'layer_0', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_0', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_0', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/dense/kernel ['discriminator', 'encoder', 'layer_0', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_0', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_0/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_0', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/dense/bias ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/key/bias ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/key/kernel ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/query/bias ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/query/kernel ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/value/bias ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/value/kernel ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/intermediate/dense/bias ['discriminator', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/dense/bias ['discriminator', 'encoder', 'layer_1', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_1', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_1', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/dense/kernel ['discriminator', 'encoder', 'layer_1', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_1', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_1/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_1', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/dense/bias ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/key/bias ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/key/kernel ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/query/bias ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/query/kernel ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/value/bias ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/value/kernel ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/intermediate/dense/bias ['discriminator', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/dense/bias ['discriminator', 'encoder', 'layer_10', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_10', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_10', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/dense/kernel ['discriminator', 'encoder', 'layer_10', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_10', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_10/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_10', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/dense/bias ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/key/bias ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/key/kernel ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/query/bias ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/query/kernel ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/value/bias ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/value/kernel ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/intermediate/dense/bias ['discriminator', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/dense/bias ['discriminator', 'encoder', 'layer_11', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_11', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_11', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/dense/kernel ['discriminator', 'encoder', 'layer_11', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_11', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_11/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_11', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/dense/bias ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/key/bias ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/key/kernel ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/query/bias ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/query/kernel ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/value/bias ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/value/kernel ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/intermediate/dense/bias ['discriminator', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/dense/bias ['discriminator', 'encoder', 'layer_2', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_2', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_2', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/dense/kernel ['discriminator', 'encoder', 'layer_2', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_2', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_2/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_2', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/dense/bias ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/key/bias ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/key/kernel ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/query/bias ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/query/kernel ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/value/bias ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/value/kernel ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/intermediate/dense/bias ['discriminator', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/dense/bias ['discriminator', 'encoder', 'layer_3', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_3', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_3', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/dense/kernel ['discriminator', 'encoder', 'layer_3', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_3', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_3/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_3', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/dense/bias ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/key/bias ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/key/kernel ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/query/bias ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/query/kernel ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/value/bias ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/value/kernel ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/intermediate/dense/bias ['discriminator', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/dense/bias ['discriminator', 'encoder', 'layer_4', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_4', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_4', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/dense/kernel ['discriminator', 'encoder', 'layer_4', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_4', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_4/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_4', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/dense/bias ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/key/bias ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/key/kernel ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/query/bias ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/query/kernel ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/value/bias ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/value/kernel ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/intermediate/dense/bias ['discriminator', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/dense/bias ['discriminator', 'encoder', 'layer_5', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_5', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_5', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/dense/kernel ['discriminator', 'encoder', 'layer_5', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_5', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_5/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_5', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/dense/bias ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/key/bias ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/key/kernel ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/query/bias ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/query/kernel ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/value/bias ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/value/kernel ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/intermediate/dense/bias ['discriminator', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/dense/bias ['discriminator', 'encoder', 'layer_6', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_6', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_6', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/dense/kernel ['discriminator', 'encoder', 'layer_6', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_6', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_6/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_6', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/dense/bias ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/key/bias ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/key/kernel ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/query/bias ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/query/kernel ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/value/bias ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/value/kernel ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/intermediate/dense/bias ['discriminator', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/dense/bias ['discriminator', 'encoder', 'layer_7', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_7', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_7', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/dense/kernel ['discriminator', 'encoder', 'layer_7', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_7', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_7/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_7', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/dense/bias ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/key/bias ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/key/kernel ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/query/bias ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/query/kernel ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/value/bias ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/value/kernel ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/intermediate/dense/bias ['discriminator', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/dense/bias ['discriminator', 'encoder', 'layer_8', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_8', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_8', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/dense/kernel ['discriminator', 'encoder', 'layer_8', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_8', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_8/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_8', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/dense/bias ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/dense/kernel ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/key/bias ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/key/bias/adam_m ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/key/bias/adam_v ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/key/kernel ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/key/kernel/adam_m ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/key/kernel/adam_v ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/query/bias ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/query/bias/adam_m ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/query/bias/adam_v ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/query/kernel ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/query/kernel/adam_m ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/query/kernel/adam_v ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/value/bias ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/value/bias/adam_m ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/value/bias/adam_v ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/value/kernel ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/value/kernel/adam_m ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/attention/self/value/kernel/adam_v ['discriminator', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/intermediate/dense/bias ['discriminator', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/intermediate/dense/bias/adam_m ['discriminator', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/intermediate/dense/bias/adam_v ['discriminator', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/intermediate/dense/kernel ['discriminator', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/intermediate/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/intermediate/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/LayerNorm/beta ['discriminator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/LayerNorm/beta/adam_m ['discriminator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/LayerNorm/beta/adam_v ['discriminator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/LayerNorm/gamma ['discriminator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/LayerNorm/gamma/adam_m ['discriminator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/LayerNorm/gamma/adam_v ['discriminator', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/dense/bias ['discriminator', 'encoder', 'layer_9', 'output', 'dense', 'bias'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/dense/bias/adam_m ['discriminator', 'encoder', 'layer_9', 'output', 'dense', 'bias', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/dense/bias/adam_v ['discriminator', 'encoder', 'layer_9', 'output', 'dense', 'bias', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/dense/kernel ['discriminator', 'encoder', 'layer_9', 'output', 'dense', 'kernel'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/dense/kernel/adam_m ['discriminator', 'encoder', 'layer_9', 'output', 'dense', 'kernel', 'adam_m'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Skipping electra/encoder/layer_9/output/dense/kernel/adam_v ['discriminator', 'encoder', 'layer_9', 'output', 'dense', 'kernel', 'adam_v'] 'ElectraForMaskedLM' object has no attribute 'discriminator'\n",
      "Initialize PyTorch weight ['electra', 'embeddings_project', 'bias'] generator/embeddings_project/bias\n",
      "Skipping generator/embeddings_project/bias/adam_m ['electra', 'embeddings_project', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/embeddings_project/bias/adam_v ['electra', 'embeddings_project', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'embeddings_project', 'kernel'] generator/embeddings_project/kernel\n",
      "Skipping generator/embeddings_project/kernel/adam_m ['electra', 'embeddings_project', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/embeddings_project/kernel/adam_v ['electra', 'embeddings_project', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_0/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_0/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_0/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_0/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_0/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_0/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_0/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_0/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_0/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_0/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_0/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_0/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_0/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_0/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_0/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_0/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_0/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_0/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_0/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_0/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_0', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias'] generator/encoder/layer_0/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_0/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_0/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_0/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_0', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_0/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_0/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_0/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_0/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_0', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'output', 'dense', 'bias'] generator/encoder/layer_0/output/dense/bias\n",
      "Skipping generator/encoder/layer_0/output/dense/bias/adam_m ['electra', 'encoder', 'layer_0', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/output/dense/bias/adam_v ['electra', 'encoder', 'layer_0', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_0', 'output', 'dense', 'kernel'] generator/encoder/layer_0/output/dense/kernel\n",
      "Skipping generator/encoder/layer_0/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_0', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_0/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_0', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_1/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_1/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_1/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_1/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.modeling_electra:Skipping global_step\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_1/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_1/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_1/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_1/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_1/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_1/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_1/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_1/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_1/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_1/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_1/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_1/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_1/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_1/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_1/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_1/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_1', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias'] generator/encoder/layer_1/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_1/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_1/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_1/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_1', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_1/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_1/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_1/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_1/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_1', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'output', 'dense', 'bias'] generator/encoder/layer_1/output/dense/bias\n",
      "Skipping generator/encoder/layer_1/output/dense/bias/adam_m ['electra', 'encoder', 'layer_1', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/output/dense/bias/adam_v ['electra', 'encoder', 'layer_1', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_1', 'output', 'dense', 'kernel'] generator/encoder/layer_1/output/dense/kernel\n",
      "Skipping generator/encoder/layer_1/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_1', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_1/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_1', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_10/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_10/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_10/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_10/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_10/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_10/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_10/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_10/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_10/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_10/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_10/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_10/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_10/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_10/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_10/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_10/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_10/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_10/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_10/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_10/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_10', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias'] generator/encoder/layer_10/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_10/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_10/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_10/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_10', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_10/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_10/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_10/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_10/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_10', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'output', 'dense', 'bias'] generator/encoder/layer_10/output/dense/bias\n",
      "Skipping generator/encoder/layer_10/output/dense/bias/adam_m ['electra', 'encoder', 'layer_10', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/output/dense/bias/adam_v ['electra', 'encoder', 'layer_10', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_10', 'output', 'dense', 'kernel'] generator/encoder/layer_10/output/dense/kernel\n",
      "Skipping generator/encoder/layer_10/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_10', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_10/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_10', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_11/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_11/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_11/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_11/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_11/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_11/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_11/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_11/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_11/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_11/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_11/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_11/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_11/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_11/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_11/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_11/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_11/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_11/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_11/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_11/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_11', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias'] generator/encoder/layer_11/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_11/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_11/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_11/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_11', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_11/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_11/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_11/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_11/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_11', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'output', 'dense', 'bias'] generator/encoder/layer_11/output/dense/bias\n",
      "Skipping generator/encoder/layer_11/output/dense/bias/adam_m ['electra', 'encoder', 'layer_11', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/output/dense/bias/adam_v ['electra', 'encoder', 'layer_11', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_11', 'output', 'dense', 'kernel'] generator/encoder/layer_11/output/dense/kernel\n",
      "Skipping generator/encoder/layer_11/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_11', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_11/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_11', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_2/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_2/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_2/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_2/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_2/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_2/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_2/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_2/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_2/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_2/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_2/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_2/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_2/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_2/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_2/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_2/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_2/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_2/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_2/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_2/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_2', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias'] generator/encoder/layer_2/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_2/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_2/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_2/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_2', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_2/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_2/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_2/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_2/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_2', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'output', 'dense', 'bias'] generator/encoder/layer_2/output/dense/bias\n",
      "Skipping generator/encoder/layer_2/output/dense/bias/adam_m ['electra', 'encoder', 'layer_2', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/output/dense/bias/adam_v ['electra', 'encoder', 'layer_2', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_2', 'output', 'dense', 'kernel'] generator/encoder/layer_2/output/dense/kernel\n",
      "Skipping generator/encoder/layer_2/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_2', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_2/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_2', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_3/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_3/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_3/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_3/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_3/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_3/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_3/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_3/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_3/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_3/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_3/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_3/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_3/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_3/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_3/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_3/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_3/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_3/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_3/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_3/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_3', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias'] generator/encoder/layer_3/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_3/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_3/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_3/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_3', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_3/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_3/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_3/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_3/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_3', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'output', 'dense', 'bias'] generator/encoder/layer_3/output/dense/bias\n",
      "Skipping generator/encoder/layer_3/output/dense/bias/adam_m ['electra', 'encoder', 'layer_3', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/output/dense/bias/adam_v ['electra', 'encoder', 'layer_3', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_3', 'output', 'dense', 'kernel'] generator/encoder/layer_3/output/dense/kernel\n",
      "Skipping generator/encoder/layer_3/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_3', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_3/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_3', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_4/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_4/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_4/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_4/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_4/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_4/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_4/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_4/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_4/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_4/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_4/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_4/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_4/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_4/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_4/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_4/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_4/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_4/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_4/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_4/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_4', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias'] generator/encoder/layer_4/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_4/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_4/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_4/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_4', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_4/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_4/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_4/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_4/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_4', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'output', 'dense', 'bias'] generator/encoder/layer_4/output/dense/bias\n",
      "Skipping generator/encoder/layer_4/output/dense/bias/adam_m ['electra', 'encoder', 'layer_4', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/output/dense/bias/adam_v ['electra', 'encoder', 'layer_4', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_4', 'output', 'dense', 'kernel'] generator/encoder/layer_4/output/dense/kernel\n",
      "Skipping generator/encoder/layer_4/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_4', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_4/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_4', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_5/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_5/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_5/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_5/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_5/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_5/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_5/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_5/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_5/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_5/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_5/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_5/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_5/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_5/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_5/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_5/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_5/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_5/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_5/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_5/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_5', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias'] generator/encoder/layer_5/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_5/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_5/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_5/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_5', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_5/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_5/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_5/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_5/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_5', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'output', 'dense', 'bias'] generator/encoder/layer_5/output/dense/bias\n",
      "Skipping generator/encoder/layer_5/output/dense/bias/adam_m ['electra', 'encoder', 'layer_5', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/output/dense/bias/adam_v ['electra', 'encoder', 'layer_5', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_5', 'output', 'dense', 'kernel'] generator/encoder/layer_5/output/dense/kernel\n",
      "Skipping generator/encoder/layer_5/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_5', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_5/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_5', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_6/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_6/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_6/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_6/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_6/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_6/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_6/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_6/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_6/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_6/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_6/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_6/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_6/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_6/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_6/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_6/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_6/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_6/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_6/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_6/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias'] generator/encoder/layer_6/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_6/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_6/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_6/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_6', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_6/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_6/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_6/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_6/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_6', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'output', 'dense', 'bias'] generator/encoder/layer_6/output/dense/bias\n",
      "Skipping generator/encoder/layer_6/output/dense/bias/adam_m ['electra', 'encoder', 'layer_6', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/output/dense/bias/adam_v ['electra', 'encoder', 'layer_6', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_6', 'output', 'dense', 'kernel'] generator/encoder/layer_6/output/dense/kernel\n",
      "Skipping generator/encoder/layer_6/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_6', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_6/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_6', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_7/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_7/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_7/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_7/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_7/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_7/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_7/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_7/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_7/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_7/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_7/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_7/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_7/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_7/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_7/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_7/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_7/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_7/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_7/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_7/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_7', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias'] generator/encoder/layer_7/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_7/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_7/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_7/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_7', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_7/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_7/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_7/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_7/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_7', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'output', 'dense', 'bias'] generator/encoder/layer_7/output/dense/bias\n",
      "Skipping generator/encoder/layer_7/output/dense/bias/adam_m ['electra', 'encoder', 'layer_7', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/output/dense/bias/adam_v ['electra', 'encoder', 'layer_7', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_7', 'output', 'dense', 'kernel'] generator/encoder/layer_7/output/dense/kernel\n",
      "Skipping generator/encoder/layer_7/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_7', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_7/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_7', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_8/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_8/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_8/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_8/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_8/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_8/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_8/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_8/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_8/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_8/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_8/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_8/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_8/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_8/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_8/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_8/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_8/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_8/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_8/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_8/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_8', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias'] generator/encoder/layer_8/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_8/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_8/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_8/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_8', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_8/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_8/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_8/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_8/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_8', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'output', 'dense', 'bias'] generator/encoder/layer_8/output/dense/bias\n",
      "Skipping generator/encoder/layer_8/output/dense/bias/adam_m ['electra', 'encoder', 'layer_8', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/output/dense/bias/adam_v ['electra', 'encoder', 'layer_8', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_8', 'output', 'dense', 'kernel'] generator/encoder/layer_8/output/dense/kernel\n",
      "Skipping generator/encoder/layer_8/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_8', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_8/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_8', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_9/attention/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_9/attention/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/attention/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_9/attention/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_9/attention/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/attention/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias'] generator/encoder/layer_9/attention/output/dense/bias\n",
      "Skipping generator/encoder/layer_9/attention/output/dense/bias/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/attention/output/dense/bias/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel'] generator/encoder/layer_9/attention/output/dense/kernel\n",
      "Skipping generator/encoder/layer_9/attention/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/attention/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias'] generator/encoder/layer_9/attention/self/key/bias\n",
      "Skipping generator/encoder/layer_9/attention/self/key/bias/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/attention/self/key/bias/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel'] generator/encoder/layer_9/attention/self/key/kernel\n",
      "Skipping generator/encoder/layer_9/attention/self/key/kernel/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/attention/self/key/kernel/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'key', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias'] generator/encoder/layer_9/attention/self/query/bias\n",
      "Skipping generator/encoder/layer_9/attention/self/query/bias/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/attention/self/query/bias/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel'] generator/encoder/layer_9/attention/self/query/kernel\n",
      "Skipping generator/encoder/layer_9/attention/self/query/kernel/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/attention/self/query/kernel/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'query', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias'] generator/encoder/layer_9/attention/self/value/bias\n",
      "Skipping generator/encoder/layer_9/attention/self/value/bias/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/attention/self/value/bias/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel'] generator/encoder/layer_9/attention/self/value/kernel\n",
      "Skipping generator/encoder/layer_9/attention/self/value/kernel/adam_m ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/attention/self/value/kernel/adam_v ['electra', 'encoder', 'layer_9', 'attention', 'self', 'value', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias'] generator/encoder/layer_9/intermediate/dense/bias\n",
      "Skipping generator/encoder/layer_9/intermediate/dense/bias/adam_m ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/intermediate/dense/bias/adam_v ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel'] generator/encoder/layer_9/intermediate/dense/kernel\n",
      "Skipping generator/encoder/layer_9/intermediate/dense/kernel/adam_m ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/intermediate/dense/kernel/adam_v ['electra', 'encoder', 'layer_9', 'intermediate', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta'] generator/encoder/layer_9/output/LayerNorm/beta\n",
      "Skipping generator/encoder/layer_9/output/LayerNorm/beta/adam_m ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/output/LayerNorm/beta/adam_v ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma'] generator/encoder/layer_9/output/LayerNorm/gamma\n",
      "Skipping generator/encoder/layer_9/output/LayerNorm/gamma/adam_m ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/output/LayerNorm/gamma/adam_v ['electra', 'encoder', 'layer_9', 'output', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'output', 'dense', 'bias'] generator/encoder/layer_9/output/dense/bias\n",
      "Skipping generator/encoder/layer_9/output/dense/bias/adam_m ['electra', 'encoder', 'layer_9', 'output', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/output/dense/bias/adam_v ['electra', 'encoder', 'layer_9', 'output', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['electra', 'encoder', 'layer_9', 'output', 'dense', 'kernel'] generator/encoder/layer_9/output/dense/kernel\n",
      "Skipping generator/encoder/layer_9/output/dense/kernel/adam_m ['electra', 'encoder', 'layer_9', 'output', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator/encoder/layer_9/output/dense/kernel/adam_v ['electra', 'encoder', 'layer_9', 'output', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['generator_predictions', 'LayerNorm', 'beta'] generator_predictions/LayerNorm/beta\n",
      "Skipping generator_predictions/LayerNorm/beta/adam_m ['generator_predictions', 'LayerNorm', 'beta', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator_predictions/LayerNorm/beta/adam_v ['generator_predictions', 'LayerNorm', 'beta', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['generator_predictions', 'LayerNorm', 'gamma'] generator_predictions/LayerNorm/gamma\n",
      "Skipping generator_predictions/LayerNorm/gamma/adam_m ['generator_predictions', 'LayerNorm', 'gamma', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator_predictions/LayerNorm/gamma/adam_v ['generator_predictions', 'LayerNorm', 'gamma', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['generator_predictions', 'dense', 'bias'] generator_predictions/dense/bias\n",
      "Skipping generator_predictions/dense/bias/adam_m ['generator_predictions', 'dense', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator_predictions/dense/bias/adam_v ['generator_predictions', 'dense', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['generator_predictions', 'dense', 'kernel'] generator_predictions/dense/kernel\n",
      "Skipping generator_predictions/dense/kernel/adam_m ['generator_predictions', 'dense', 'kernel', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator_predictions/dense/kernel/adam_v ['generator_predictions', 'dense', 'kernel', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Initialize PyTorch weight ['generator_lm_head', 'bias'] generator_predictions/output_bias\n",
      "Skipping generator_predictions/output_bias/adam_m ['generator_lm_head', 'bias', 'adam_m'] 'Parameter' object has no attribute 'adam_m'\n",
      "Skipping generator_predictions/output_bias/adam_v ['generator_lm_head', 'bias', 'adam_v'] 'Parameter' object has no attribute 'adam_v'\n",
      "Save PyTorch model to electra-small-generator-bahasa-cased/pytorch_model.bin\n"
     ]
    }
   ],
   "source": [
    "convert_tf_checkpoint_to_pytorch('dataset/models/bahasa-small/model.ckpt-200000', \n",
    "                                 'SMALL-config-generator.json', \n",
    "                                 'electra-small-generator-bahasa-cased/pytorch_model.bin',\n",
    "                                'generator')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.tokenization_utils:Model name './electra-small-generator-bahasa-cased' not found in model shortcut name list (google/electra-small-generator, google/electra-base-generator, google/electra-large-generator, google/electra-small-discriminator, google/electra-base-discriminator, google/electra-large-discriminator). Assuming './electra-small-generator-bahasa-cased' is a path, a model identifier, or url to a directory containing tokenizer files.\n",
      "INFO:transformers.tokenization_utils:Didn't find file ./electra-small-generator-bahasa-cased/added_tokens.json. We won't load it.\n",
      "INFO:transformers.tokenization_utils:loading file ./electra-small-generator-bahasa-cased/vocab.txt\n",
      "INFO:transformers.tokenization_utils:loading file None\n",
      "INFO:transformers.tokenization_utils:loading file ./electra-small-generator-bahasa-cased/special_tokens_map.json\n",
      "INFO:transformers.tokenization_utils:loading file ./electra-small-generator-bahasa-cased/tokenizer_config.json\n"
     ]
    }
   ],
   "source": [
    "tokenizer = ElectraTokenizer.from_pretrained('./electra-small-generator-bahasa-cased', do_lower_case = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ElectraConfig {\n",
       "  \"_num_labels\": 2,\n",
       "  \"architectures\": null,\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"bad_words_ids\": null,\n",
       "  \"bos_token_id\": null,\n",
       "  \"decoder_start_token_id\": null,\n",
       "  \"do_sample\": false,\n",
       "  \"early_stopping\": false,\n",
       "  \"embedding_size\": 128,\n",
       "  \"eos_token_id\": null,\n",
       "  \"finetuning_task\": null,\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
       "  \"hidden_size\": 64,\n",
       "  \"id2label\": {\n",
       "    \"0\": \"LABEL_0\",\n",
       "    \"1\": \"LABEL_1\"\n",
       "  },\n",
       "  \"initializer_range\": 0.02,\n",
       "  \"intermediate_size\": 256,\n",
       "  \"is_decoder\": false,\n",
       "  \"is_encoder_decoder\": false,\n",
       "  \"label2id\": {\n",
       "    \"LABEL_0\": 0,\n",
       "    \"LABEL_1\": 1\n",
       "  },\n",
       "  \"layer_norm_eps\": 1e-12,\n",
       "  \"length_penalty\": 1.0,\n",
       "  \"max_length\": 20,\n",
       "  \"max_position_embeddings\": 512,\n",
       "  \"min_length\": 0,\n",
       "  \"model_type\": \"electra\",\n",
       "  \"no_repeat_ngram_size\": 0,\n",
       "  \"num_attention_heads\": 1,\n",
       "  \"num_beams\": 1,\n",
       "  \"num_hidden_layers\": 12,\n",
       "  \"num_return_sequences\": 1,\n",
       "  \"output_attentions\": false,\n",
       "  \"output_hidden_states\": false,\n",
       "  \"output_past\": true,\n",
       "  \"pad_token_id\": 0,\n",
       "  \"prefix\": null,\n",
       "  \"pruned_heads\": {},\n",
       "  \"repetition_penalty\": 1.0,\n",
       "  \"task_specific_params\": null,\n",
       "  \"temperature\": 1.0,\n",
       "  \"top_k\": 50,\n",
       "  \"top_p\": 1.0,\n",
       "  \"torchscript\": false,\n",
       "  \"type_vocab_size\": 2,\n",
       "  \"use_bfloat16\": false,\n",
       "  \"vocab_size\": 32000\n",
       "}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the generator hyper-parameters from the JSON file (the same file that was\n",
    "# passed to the checkpoint conversion). `ElectraConfig(...)` takes `vocab_size` as\n",
    "# its first positional argument, so passing the path to the constructor would store\n",
    "# the file name as the vocabulary size and never read the file at all.\n",
    "config = ElectraConfig.from_json_file('SMALL-config-generator.json')\n",
    "\n",
    "# Explicit overrides kept from the original cell so the resulting values for these\n",
    "# fields are unchanged. NOTE(review): presumably they mirror the JSON file and are\n",
    "# now redundant -- confirm against SMALL-config-generator.json before removing.\n",
    "config.vocab_size = 32000\n",
    "config.hidden_size = 64\n",
    "config.intermediate_size = 256\n",
    "config.num_attention_heads = 1\n",
    "\n",
    "# Display the effective configuration.\n",
    "config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.modeling_utils:loading weights file ./electra-small-generator-bahasa-cased/pytorch_model.bin\n"
     ]
    }
   ],
   "source": [
    "# Load the converted weights file into an LM-head model built from `config`.\n",
    "weights_path = './electra-small-generator-bahasa-cased/pytorch_model.bin'\n",
    "model = AutoModelWithLMHead.from_pretrained(weights_path, config=config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combine the local model and tokenizer into a fill-mask pipeline.\n",
    "fill_mask = pipeline(\n",
    "    'fill-mask',\n",
    "    tokenizer=tokenizer,\n",
    "    model=model,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'sequence': '[CLS] makan ayam dengan rendang [SEP]',\n",
       "  'score': 0.11076924204826355,\n",
       "  'token': 6288},\n",
       " {'sequence': '[CLS] makan ayam dengan ketupat [SEP]',\n",
       "  'score': 0.1011609435081482,\n",
       "  'token': 8481},\n",
       " {'sequence': '[CLS] makan ayam dengan ikan [SEP]',\n",
       "  'score': 0.07039457559585571,\n",
       "  'token': 3359},\n",
       " {'sequence': '[CLS] makan ayam dengan kacang [SEP]',\n",
       "  'score': 0.05665124952793121,\n",
       "  'token': 5215},\n",
       " {'sequence': '[CLS] makan ayam dengan nasi [SEP]',\n",
       "  'score': 0.03904604911804199,\n",
       "  'token': 2533}]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Smoke test: show the pipeline's predictions for the masked token.\n",
    "masked_sentence = 'makan ayam dengan [MASK]'\n",
    "fill_mask(masked_sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.configuration_utils:Configuration saved in electra-small-generator-bahasa-cased/config.json\n",
      "INFO:transformers.modeling_utils:Model weights saved in electra-small-generator-bahasa-cased/pytorch_model.bin\n"
     ]
    }
   ],
   "source": [
    "# Write the model weights and its config.json into the export directory.\n",
    "export_dir = 'electra-small-generator-bahasa-cased'\n",
    "model.save_pretrained(export_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
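    "# Manual step: publish the export directory to the Hugging Face model hub.\n",
    "# Kept commented out, presumably so that re-running the notebook does not re-upload.\n",
    "# !transformers-cli upload ./electra-small-generator-bahasa-cased"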
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:filelock:Lock 140215527145200 acquired on /home/husein/.cache/torch/transformers/2225b7e40d6f1dfaefbfb59741dd8fab71eed7fa38652ede72b07abf86c00502.502ff23bb8428870558596270344537005b56a6e1ab20e52182f06dbeafdd834.lock\n",
      "INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-generator-bahasa-cased/config.json not found in cache or force_download set to True, downloading to /home/husein/.cache/torch/transformers/tmp_yoih5_e\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "328970d4e87b48029638fe47ffc0a639",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1263.0, style=ProgressStyle(description…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-generator-bahasa-cased/config.json in cache at /home/husein/.cache/torch/transformers/2225b7e40d6f1dfaefbfb59741dd8fab71eed7fa38652ede72b07abf86c00502.502ff23bb8428870558596270344537005b56a6e1ab20e52182f06dbeafdd834\n",
      "INFO:transformers.file_utils:creating metadata file for /home/husein/.cache/torch/transformers/2225b7e40d6f1dfaefbfb59741dd8fab71eed7fa38652ede72b07abf86c00502.502ff23bb8428870558596270344537005b56a6e1ab20e52182f06dbeafdd834\n",
      "INFO:filelock:Lock 140215527145200 released on /home/husein/.cache/torch/transformers/2225b7e40d6f1dfaefbfb59741dd8fab71eed7fa38652ede72b07abf86c00502.502ff23bb8428870558596270344537005b56a6e1ab20e52182f06dbeafdd834.lock\n",
      "INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-generator-bahasa-cased/config.json from cache at /home/husein/.cache/torch/transformers/2225b7e40d6f1dfaefbfb59741dd8fab71eed7fa38652ede72b07abf86c00502.502ff23bb8428870558596270344537005b56a6e1ab20e52182f06dbeafdd834\n",
      "INFO:transformers.configuration_utils:Model config ElectraConfig {\n",
      "  \"_num_labels\": 2,\n",
      "  \"architectures\": [\n",
      "    \"ElectraForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bad_words_ids\": null,\n",
      "  \"bos_token_id\": null,\n",
      "  \"decoder_start_token_id\": null,\n",
      "  \"do_sample\": false,\n",
      "  \"early_stopping\": false,\n",
      "  \"embedding_size\": 128,\n",
      "  \"eos_token_id\": null,\n",
      "  \"finetuning_task\": null,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 64,\n",
      "  \"id2label\": {\n",
      "    \"0\": \"LABEL_0\",\n",
      "    \"1\": \"LABEL_1\"\n",
      "  },\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 256,\n",
      "  \"is_decoder\": false,\n",
      "  \"is_encoder_decoder\": false,\n",
      "  \"label2id\": {\n",
      "    \"LABEL_0\": 0,\n",
      "    \"LABEL_1\": 1\n",
      "  },\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"length_penalty\": 1.0,\n",
      "  \"max_length\": 20,\n",
      "  \"max_position_embeddings\": 512,\n",
      "  \"min_length\": 0,\n",
      "  \"model_type\": \"electra\",\n",
      "  \"no_repeat_ngram_size\": 0,\n",
      "  \"num_attention_heads\": 1,\n",
      "  \"num_beams\": 1,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"num_return_sequences\": 1,\n",
      "  \"output_attentions\": false,\n",
      "  \"output_hidden_states\": false,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"prefix\": null,\n",
      "  \"pruned_heads\": {},\n",
      "  \"repetition_penalty\": 1.0,\n",
      "  \"task_specific_params\": null,\n",
      "  \"temperature\": 1.0,\n",
      "  \"top_k\": 50,\n",
      "  \"top_p\": 1.0,\n",
      "  \"torchscript\": false,\n",
      "  \"type_vocab_size\": 2,\n",
      "  \"use_bfloat16\": false,\n",
      "  \"vocab_size\": 32000\n",
      "}\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:filelock:Lock 140219560065848 acquired on /home/husein/.cache/torch/transformers/77d547f6bae4a3e87edd6eb8759032304c97568beff85f8f449b387bd2aef67f.6bbea526681b394cd000fa09458d947028d3ae398fa0d7c3c7101cb7d3b93399.lock\n",
      "INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-generator-bahasa-cased/pytorch_model.bin not found in cache or force_download set to True, downloading to /home/husein/.cache/torch/transformers/tmpqu75xxuq\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "27042a1ade31449884010b905b9b1fcb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=19289548.0, style=ProgressStyle(descrip…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-generator-bahasa-cased/pytorch_model.bin in cache at /home/husein/.cache/torch/transformers/77d547f6bae4a3e87edd6eb8759032304c97568beff85f8f449b387bd2aef67f.6bbea526681b394cd000fa09458d947028d3ae398fa0d7c3c7101cb7d3b93399\n",
      "INFO:transformers.file_utils:creating metadata file for /home/husein/.cache/torch/transformers/77d547f6bae4a3e87edd6eb8759032304c97568beff85f8f449b387bd2aef67f.6bbea526681b394cd000fa09458d947028d3ae398fa0d7c3c7101cb7d3b93399\n",
      "INFO:filelock:Lock 140219560065848 released on /home/husein/.cache/torch/transformers/77d547f6bae4a3e87edd6eb8759032304c97568beff85f8f449b387bd2aef67f.6bbea526681b394cd000fa09458d947028d3ae398fa0d7c3c7101cb7d3b93399.lock\n",
      "INFO:transformers.modeling_utils:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-generator-bahasa-cased/pytorch_model.bin from cache at /home/husein/.cache/torch/transformers/77d547f6bae4a3e87edd6eb8759032304c97568beff85f8f449b387bd2aef67f.6bbea526681b394cd000fa09458d947028d3ae398fa0d7c3c7101cb7d3b93399\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Round-trip check: load the model again, this time by its hub identifier.\n",
    "hub_model_id = 'huseinzol05/electra-small-generator-bahasa-cased'\n",
    "model = AutoModelWithLMHead.from_pretrained(hub_model_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:transformers.tokenization_utils:Model name 'huseinzol05/electra-small-generator-bahasa-cased' not found in model shortcut name list (google/electra-small-generator, google/electra-base-generator, google/electra-large-generator, google/electra-small-discriminator, google/electra-base-discriminator, google/electra-large-discriminator). Assuming 'huseinzol05/electra-small-generator-bahasa-cased' is a path, a model identifier, or url to a directory containing tokenizer files.\n",
      "INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-generator-bahasa-cased/vocab.txt from cache at /home/husein/.cache/torch/transformers/7394163ff5a5c8288c8fb14f990279b09c999238f7f999631fb90ff269b20aef.1e0699ac9cf6143f19e7441cfbfd684e785a5287fc6f61b4fe526a166d3f27bd\n",
      "INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-generator-bahasa-cased/added_tokens.json from cache at None\n",
      "INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-generator-bahasa-cased/special_tokens_map.json from cache at /home/husein/.cache/torch/transformers/bb91a9896aab1b108721d457ca66a00f6a689239a8b31b6a07f36f5ad17eef80.275045728fbf41c11d3dae08b8742c054377e18d92cc7b72b6351152a99b64e4\n",
      "INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/huseinzol05/electra-small-generator-bahasa-cased/tokenizer_config.json from cache at /home/husein/.cache/torch/transformers/0d35547113ab099b8b030912ca11afe6762b159c7209792503f5eb28a9bce79e.3889713104075cfee9e96090bcdd0dc753733b3db9da20d1dd8b2cd1030536a2\n"
     ]
    }
   ],
   "source": [
    "# Load the matching tokenizer by hub identifier, again without lower-casing.\n",
    "hub_tokenizer_id = 'huseinzol05/electra-small-generator-bahasa-cased'\n",
    "tokenizer = ElectraTokenizer.from_pretrained(hub_tokenizer_id, do_lower_case=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'sequence': '[CLS] makan ayam dengan rendang [SEP]',\n",
       "  'score': 0.11076924204826355,\n",
       "  'token': 6288},\n",
       " {'sequence': '[CLS] makan ayam dengan ketupat [SEP]',\n",
       "  'score': 0.1011609435081482,\n",
       "  'token': 8481},\n",
       " {'sequence': '[CLS] makan ayam dengan ikan [SEP]',\n",
       "  'score': 0.07039457559585571,\n",
       "  'token': 3359},\n",
       " {'sequence': '[CLS] makan ayam dengan kacang [SEP]',\n",
       "  'score': 0.05665124952793121,\n",
       "  'token': 5215},\n",
       " {'sequence': '[CLS] makan ayam dengan nasi [SEP]',\n",
       "  'score': 0.03904604911804199,\n",
       "  'token': 2533}]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebuild the pipeline from the hub-loaded model/tokenizer and repeat the same prompt.\n",
    "fill_mask = pipeline(\n",
    "    'fill-mask',\n",
    "    tokenizer=tokenizer,\n",
    "    model=model,\n",
    ")\n",
    "masked_sentence = 'makan ayam dengan [MASK]'\n",
    "fill_mask(masked_sentence)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
